STEP 1 - EXPLORATORY ANALYSIS
# first select mobility data for our target cities
in_mob <- vroom("2020_IN_Region_Mobility_Report.csv") %>% filter(sub_region_1=="Delhi")
## Rows: 190,085
## Columns: 14
## Delimiter: ","
## chr [5]: country_region_code, country_region, sub_region_1, sub_region_2, iso_3166_2_code
## dbl [6]: retail_and_recreation_percent_change_from_baseline, grocery_and_pharmacy_perce...
## lgl [2]: metro_area, census_fips_code
## date [1]: date
##
## Use `spec()` to retrieve the guessed column specification
## Pass a specification to the `col_types` argument to quiet this message
gb_mob <- vroom("2020_GB_Region_Mobility_Report.csv") %>% filter(sub_region_1=="Greater London")
## Rows: 119,169
## Columns: 14
## Delimiter: ","
## chr [5]: country_region_code, country_region, sub_region_1, sub_region_2, iso_3166_2_code
## dbl [6]: retail_and_recreation_percent_change_from_baseline, grocery_and_pharmacy_perce...
## lgl [2]: metro_area, census_fips_code
## date [1]: date
##
## Use `spec()` to retrieve the guessed column specification
## Pass a specification to the `col_types` argument to quiet this message
ch_mob <- vroom("2020_CH_Region_Mobility_Report.csv") %>% filter(sub_region_1=="Zurich")
## Rows: 7,466
## Columns: 14
## Delimiter: ","
## chr [4]: country_region_code, country_region, sub_region_1, iso_3166_2_code
## dbl [6]: retail_and_recreation_percent_change_from_baseline, grocery_and_pharmacy_perce...
## lgl [3]: sub_region_2, metro_area, census_fips_code
## date [1]: date
##
## Use `spec()` to retrieve the guessed column specification
## Pass a specification to the `col_types` argument to quiet this message
# select only past 36 months
london_air <- vroom("london-air-quality.csv") %>%
mutate(date = ymd(date),
year = year(date)) %>% filter(date>=as.Date("2018-01-01"))
## Rows: 2,516
## Columns: 7
## Delimiter: ","
## chr [1]: date
## dbl [6]: pm25, pm10, o3, no2, so2, co
##
## Use `spec()` to retrieve the guessed column specification
## Pass a specification to the `col_types` argument to quiet this message
delhi_air <- vroom("delhi-institute of tool engineering, wazirpur, delhi, delhi, india-air-quality.csv") %>%
mutate(date = ymd(date),
year = year(date)) %>% filter(date>=as.Date("2018-01-01"))
## Rows: 1,042
## Columns: 7
## Delimiter: ","
## chr [1]: date
## dbl [6]: pm25, pm10, o3, no2, so2, co
##
## Use `spec()` to retrieve the guessed column specification
## Pass a specification to the `col_types` argument to quiet this message
wuhan_air <- vroom("wuhan-air-quality.csv") %>%
mutate(date = ymd(date),
year = year(date)) %>% filter(date>=as.Date("2018-01-01"))
## Rows: 2,394
## Columns: 7
## Delimiter: ","
## chr [1]: date
## dbl [6]: pm25, pm10, o3, no2, so2, co
##
## Use `spec()` to retrieve the guessed column specification
## Pass a specification to the `col_types` argument to quiet this message
zurich_air <- vroom("zurich-kaserne, switzerland-air-quality.csv") %>%
mutate(date = ymd(date),
year = year(date)) %>% filter(date>=as.Date("2018-01-01"))
## Rows: 2,212
## Columns: 6
## Delimiter: ","
## chr [1]: date
## dbl [5]: pm25, pm10, o3, no2, so2
##
## Use `spec()` to retrieve the guessed column specification
## Pass a specification to the `col_types` argument to quiet this message
# describe the mobility data
skim(gb_mob)
Data summary
| Name |
gb_mob |
| Number of rows |
9758 |
| Number of columns |
14 |
| _______________________ |
|
| Column type frequency: |
|
| character |
5 |
| Date |
1 |
| logical |
2 |
| numeric |
6 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| country_region_code |
0 |
1.00 |
2 |
2 |
0 |
1 |
0 |
| country_region |
0 |
1.00 |
14 |
14 |
0 |
1 |
0 |
| sub_region_1 |
0 |
1.00 |
14 |
14 |
0 |
1 |
0 |
| sub_region_2 |
287 |
0.97 |
14 |
40 |
0 |
33 |
0 |
| iso_3166_2_code |
287 |
0.97 |
6 |
6 |
0 |
33 |
0 |
Variable type: Date
| date |
0 |
1 |
2020-02-15 |
2020-11-27 |
2020-07-07 |
287 |
Variable type: logical
| metro_area |
9758 |
0 |
NaN |
: |
| census_fips_code |
9758 |
0 |
NaN |
: |
Variable type: numeric
| retail_and_recreation_percent_change_from_baseline |
18 |
1.00 |
-45.02 |
26.85 |
-99 |
-68 |
-45 |
-24 |
90 |
▆▇▅▁▁ |
| grocery_and_pharmacy_percent_change_from_baseline |
12 |
1.00 |
-17.05 |
16.89 |
-91 |
-24 |
-14 |
-8 |
54 |
▁▁▇▂▁ |
| parks_percent_change_from_baseline |
812 |
0.92 |
35.66 |
61.54 |
-96 |
-4 |
24 |
69 |
457 |
▇▇▁▁▁ |
| transit_stations_percent_change_from_baseline |
0 |
1.00 |
-50.18 |
21.34 |
-97 |
-65 |
-52 |
-40 |
71 |
▃▇▂▁▁ |
| workplaces_percent_change_from_baseline |
132 |
0.99 |
-46.14 |
23.53 |
-91 |
-63 |
-52 |
-29 |
7 |
▂▇▅▃▂ |
| residential_percent_change_from_baseline |
393 |
0.96 |
16.64 |
9.90 |
-1 |
10 |
16 |
23 |
42 |
▆▇▇▅▂ |
Data summary
| Name |
ch_mob |
| Number of rows |
287 |
| Number of columns |
14 |
| _______________________ |
|
| Column type frequency: |
|
| character |
4 |
| Date |
1 |
| logical |
3 |
| numeric |
6 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| country_region_code |
0 |
1 |
2 |
2 |
0 |
1 |
0 |
| country_region |
0 |
1 |
11 |
11 |
0 |
1 |
0 |
| sub_region_1 |
0 |
1 |
6 |
6 |
0 |
1 |
0 |
| iso_3166_2_code |
0 |
1 |
5 |
5 |
0 |
1 |
0 |
Variable type: Date
| date |
0 |
1 |
2020-02-15 |
2020-11-27 |
2020-07-07 |
287 |
Variable type: logical
| sub_region_2 |
287 |
0 |
NaN |
: |
| metro_area |
287 |
0 |
NaN |
: |
| census_fips_code |
287 |
0 |
NaN |
: |
Variable type: numeric
| retail_and_recreation_percent_change_from_baseline |
0 |
1.00 |
-28.77 |
22.51 |
-89 |
-32 |
-20 |
-14.00 |
5 |
▁▂▁▇▃ |
| grocery_and_pharmacy_percent_change_from_baseline |
6 |
0.98 |
-1.67 |
13.95 |
-81 |
-5 |
-1 |
4.00 |
44 |
▁▁▂▇▁ |
| parks_percent_change_from_baseline |
23 |
0.92 |
41.32 |
53.09 |
-73 |
-4 |
36 |
85.25 |
161 |
▂▇▆▅▂ |
| transit_stations_percent_change_from_baseline |
0 |
1.00 |
-30.50 |
15.91 |
-73 |
-37 |
-29 |
-20.50 |
5 |
▂▃▇▇▂ |
| workplaces_percent_change_from_baseline |
0 |
1.00 |
-27.24 |
16.81 |
-88 |
-36 |
-28 |
-12.00 |
1 |
▁▁▅▇▆ |
| residential_percent_change_from_baseline |
0 |
1.00 |
9.35 |
7.75 |
-4 |
4 |
7 |
13.00 |
36 |
▅▇▃▂▁ |
Data summary
| Name |
in_mob |
| Number of rows |
3444 |
| Number of columns |
14 |
| _______________________ |
|
| Column type frequency: |
|
| character |
5 |
| Date |
1 |
| logical |
2 |
| numeric |
6 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| country_region_code |
0 |
1.00 |
2 |
2 |
0 |
1 |
0 |
| country_region |
0 |
1.00 |
5 |
5 |
0 |
1 |
0 |
| sub_region_1 |
0 |
1.00 |
5 |
5 |
0 |
1 |
0 |
| sub_region_2 |
287 |
0.92 |
8 |
16 |
0 |
11 |
0 |
| iso_3166_2_code |
3157 |
0.08 |
5 |
5 |
0 |
1 |
0 |
Variable type: Date
| date |
0 |
1 |
2020-02-15 |
2020-11-27 |
2020-07-07 |
287 |
Variable type: logical
| metro_area |
3444 |
0 |
NaN |
: |
| census_fips_code |
3444 |
0 |
NaN |
: |
Variable type: numeric
| retail_and_recreation_percent_change_from_baseline |
0 |
1 |
-49.84 |
25.36 |
-96 |
-70.00 |
-48 |
-34 |
14 |
▅▅▇▂▂ |
| grocery_and_pharmacy_percent_change_from_baseline |
0 |
1 |
-20.24 |
28.53 |
-90 |
-40.00 |
-15 |
-1 |
89 |
▂▅▇▁▁ |
| parks_percent_change_from_baseline |
0 |
1 |
-66.35 |
28.37 |
-99 |
-86.25 |
-73 |
-59 |
31 |
▇▆▁▁▁ |
| transit_stations_percent_change_from_baseline |
0 |
1 |
-47.02 |
26.91 |
-95 |
-69.00 |
-47 |
-27 |
21 |
▆▇▇▅▂ |
| workplaces_percent_change_from_baseline |
0 |
1 |
-41.99 |
23.30 |
-88 |
-54.00 |
-40 |
-29 |
9 |
▃▂▇▃▂ |
| residential_percent_change_from_baseline |
0 |
1 |
16.45 |
9.55 |
-1 |
10.00 |
14 |
22 |
39 |
▃▇▅▂▂ |
# describe/view air quality data
skim(london_air)
Data summary
| Name |
london_air |
| Number of rows |
1062 |
| Number of columns |
8 |
| _______________________ |
|
| Column type frequency: |
|
| Date |
1 |
| numeric |
7 |
| ________________________ |
|
| Group variables |
None |
Variable type: Date
| date |
0 |
1 |
2018-01-01 |
2020-12-01 |
2019-06-16 |
1062 |
Variable type: numeric
| pm25 |
2 |
1.00 |
55.51 |
20.31 |
19 |
42 |
52 |
63 |
159 |
▆▇▂▁▁ |
| pm10 |
3 |
1.00 |
23.21 |
9.51 |
6 |
17 |
21 |
27 |
64 |
▅▇▂▁▁ |
| o3 |
4 |
1.00 |
25.08 |
10.33 |
1 |
19 |
24 |
30 |
73 |
▂▇▃▁▁ |
| no2 |
3 |
1.00 |
33.04 |
14.04 |
5 |
22 |
32 |
42 |
82 |
▅▇▆▂▁ |
| so2 |
124 |
0.88 |
3.07 |
2.02 |
1 |
2 |
3 |
4 |
16 |
▇▂▁▁▁ |
| co |
16 |
0.98 |
5.20 |
2.80 |
1 |
3 |
5 |
7 |
15 |
▇▇▅▂▁ |
| year |
0 |
1.00 |
2018.97 |
0.81 |
2018 |
2018 |
2019 |
2020 |
2020 |
▇▁▇▁▇ |
Data summary
| Name |
zurich_air |
| Number of rows |
1056 |
| Number of columns |
7 |
| _______________________ |
|
| Column type frequency: |
|
| Date |
1 |
| numeric |
6 |
| ________________________ |
|
| Group variables |
None |
Variable type: Date
| date |
0 |
1 |
2018-01-07 |
2020-12-01 |
2019-06-19 |
1056 |
Variable type: numeric
| pm25 |
4 |
1.00 |
1.97 |
1.17 |
1 |
1.00 |
2 |
2 |
8 |
▇▂▁▁▁ |
| pm10 |
4 |
1.00 |
12.96 |
7.58 |
1 |
7.00 |
12 |
17 |
51 |
▇▆▂▁▁ |
| o3 |
6 |
0.99 |
30.23 |
14.86 |
1 |
20.25 |
30 |
40 |
82 |
▃▇▆▂▁ |
| no2 |
6 |
0.99 |
9.82 |
5.32 |
1 |
6.00 |
9 |
13 |
30 |
▆▇▃▁▁ |
| so2 |
1000 |
0.05 |
1.05 |
0.23 |
1 |
1.00 |
1 |
1 |
2 |
▇▁▁▁▁ |
| year |
0 |
1.00 |
2018.98 |
0.81 |
2018 |
2018.00 |
2019 |
2020 |
2020 |
▇▁▇▁▇ |
Data summary
| Name |
wuhan_air |
| Number of rows |
1047 |
| Number of columns |
8 |
| _______________________ |
|
| Column type frequency: |
|
| Date |
1 |
| numeric |
7 |
| ________________________ |
|
| Group variables |
None |
Variable type: Date
| date |
0 |
1 |
2018-01-16 |
2020-12-01 |
2019-06-24 |
1047 |
Variable type: numeric
| pm25 |
9 |
0.99 |
124.92 |
36.04 |
29 |
99 |
124 |
150 |
291 |
▂▇▆▁▁ |
| pm10 |
8 |
0.99 |
59.99 |
25.51 |
6 |
45 |
57 |
72 |
328 |
▇▃▁▁▁ |
| o3 |
11 |
0.99 |
48.83 |
27.90 |
1 |
28 |
43 |
65 |
204 |
▇▇▂▁▁ |
| no2 |
11 |
0.99 |
25.43 |
11.27 |
4 |
17 |
23 |
32 |
67 |
▅▇▃▂▁ |
| so2 |
9 |
0.99 |
5.30 |
3.06 |
1 |
3 |
4 |
7 |
21 |
▇▃▁▁▁ |
| co |
10 |
0.99 |
10.88 |
3.73 |
4 |
8 |
10 |
13 |
36 |
▇▆▁▁▁ |
| year |
0 |
1.00 |
2018.99 |
0.81 |
2018 |
2018 |
2019 |
2020 |
2020 |
▇▁▇▁▇ |
Data summary
| Name |
delhi_air |
| Number of rows |
1042 |
| Number of columns |
8 |
| _______________________ |
|
| Column type frequency: |
|
| Date |
1 |
| numeric |
7 |
| ________________________ |
|
| Group variables |
None |
Variable type: Date
| date |
0 |
1 |
2018-01-19 |
2020-12-03 |
2019-06-25 |
1042 |
Variable type: numeric
| pm25 |
13 |
0.99 |
186.23 |
87.84 |
26 |
130 |
164 |
219.00 |
828 |
▇▃▁▁▁ |
| pm10 |
28 |
0.97 |
191.87 |
133.69 |
29 |
99 |
146 |
242.75 |
979 |
▇▂▁▁▁ |
| o3 |
17 |
0.98 |
20.43 |
16.25 |
1 |
7 |
17 |
30.00 |
108 |
▇▃▁▁▁ |
| no2 |
25 |
0.98 |
21.69 |
10.60 |
1 |
14 |
20 |
27.00 |
66 |
▅▇▂▁▁ |
| so2 |
29 |
0.97 |
9.35 |
4.69 |
1 |
6 |
9 |
12.00 |
40 |
▇▇▁▁▁ |
| co |
19 |
0.98 |
13.64 |
6.64 |
2 |
9 |
12 |
16.00 |
75 |
▇▂▁▁▁ |
| year |
0 |
1.00 |
2018.99 |
0.81 |
2018 |
2018 |
2019 |
2020.00 |
2020 |
▇▁▇▁▇ |
# check for dupes
london_air%>%get_dupes(date)
## No duplicate combinations found of: date
## # A tibble: 0 x 9
## # … with 9 variables: date <date>, dupe_count <int>, pm25 <dbl>, pm10 <dbl>,
## # o3 <dbl>, no2 <dbl>, so2 <dbl>, co <dbl>, year <dbl>
wuhan_air%>%get_dupes(date)
## No duplicate combinations found of: date
## # A tibble: 0 x 9
## # … with 9 variables: date <date>, dupe_count <int>, pm25 <dbl>, pm10 <dbl>,
## # o3 <dbl>, no2 <dbl>, so2 <dbl>, co <dbl>, year <dbl>
zurich_air %>%get_dupes(date)
## No duplicate combinations found of: date
## # A tibble: 0 x 8
## # … with 8 variables: date <date>, dupe_count <int>, pm25 <dbl>, pm10 <dbl>,
## # o3 <dbl>, no2 <dbl>, so2 <dbl>, year <dbl>
delhi_air %>% get_dupes(date)
## No duplicate combinations found of: date
## # A tibble: 0 x 9
## # … with 9 variables: date <date>, dupe_count <int>, pm25 <dbl>, pm10 <dbl>,
## # o3 <dbl>, no2 <dbl>, so2 <dbl>, co <dbl>, year <dbl>
# dupes in mobility?
in_mob %>% get_dupes(date)
## # A tibble: 3,444 x 15
## date dupe_count country_region_… country_region sub_region_1
## <date> <int> <chr> <chr> <chr>
## 1 2020-02-15 12 IN India Delhi
## 2 2020-02-15 12 IN India Delhi
## 3 2020-02-15 12 IN India Delhi
## 4 2020-02-15 12 IN India Delhi
## 5 2020-02-15 12 IN India Delhi
## 6 2020-02-15 12 IN India Delhi
## 7 2020-02-15 12 IN India Delhi
## 8 2020-02-15 12 IN India Delhi
## 9 2020-02-15 12 IN India Delhi
## 10 2020-02-15 12 IN India Delhi
## # … with 3,434 more rows, and 10 more variables: sub_region_2 <chr>,
## # metro_area <lgl>, iso_3166_2_code <chr>, census_fips_code <lgl>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>,
## # residential_percent_change_from_baseline <dbl>
ch_mob %>% get_dupes(date)
## No duplicate combinations found of: date
## # A tibble: 0 x 15
## # … with 15 variables: date <date>, dupe_count <int>,
## # country_region_code <chr>, country_region <chr>, sub_region_1 <chr>,
## # sub_region_2 <lgl>, metro_area <lgl>, iso_3166_2_code <chr>,
## # census_fips_code <lgl>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>,
## # residential_percent_change_from_baseline <dbl>
gb_mob %>% get_dupes(date)
## # A tibble: 9,758 x 15
## date dupe_count country_region_… country_region sub_region_1
## <date> <int> <chr> <chr> <chr>
## 1 2020-02-15 34 GB United Kingdom Greater Lon…
## 2 2020-02-15 34 GB United Kingdom Greater Lon…
## 3 2020-02-15 34 GB United Kingdom Greater Lon…
## 4 2020-02-15 34 GB United Kingdom Greater Lon…
## 5 2020-02-15 34 GB United Kingdom Greater Lon…
## 6 2020-02-15 34 GB United Kingdom Greater Lon…
## 7 2020-02-15 34 GB United Kingdom Greater Lon…
## 8 2020-02-15 34 GB United Kingdom Greater Lon…
## 9 2020-02-15 34 GB United Kingdom Greater Lon…
## 10 2020-02-15 34 GB United Kingdom Greater Lon…
## # … with 9,748 more rows, and 10 more variables: sub_region_2 <chr>,
## # metro_area <lgl>, iso_3166_2_code <chr>, census_fips_code <lgl>,
## # retail_and_recreation_percent_change_from_baseline <dbl>,
## # grocery_and_pharmacy_percent_change_from_baseline <dbl>,
## # parks_percent_change_from_baseline <dbl>,
## # transit_stations_percent_change_from_baseline <dbl>,
## # workplaces_percent_change_from_baseline <dbl>,
## # residential_percent_change_from_baseline <dbl>
# fix duplicates across sub regions by averaging the values
in_mob_clean <- in_mob %>% group_by(date) %>%
summarise(retail_and_recreation_percent_change_from_baseline=mean(retail_and_recreation_percent_change_from_baseline),
grocery_and_pharmacy_percent_change_from_baseline=mean(grocery_and_pharmacy_percent_change_from_baseline),
parks_percent_change_from_baseline=mean(parks_percent_change_from_baseline),
transit_stations_percent_change_from_baseline=mean(transit_stations_percent_change_from_baseline),
workplaces_percent_change_from_baseline=mean(workplaces_percent_change_from_baseline),
residential_percent_change_from_baseline=mean(residential_percent_change_from_baseline)) %>% ungroup()
## `summarise()` ungrouping output (override with `.groups` argument)
gb_mob_clean <- gb_mob %>% group_by(date) %>%
summarise(retail_and_recreation_percent_change_from_baseline=mean(retail_and_recreation_percent_change_from_baseline),
grocery_and_pharmacy_percent_change_from_baseline=mean(grocery_and_pharmacy_percent_change_from_baseline),
parks_percent_change_from_baseline=mean(parks_percent_change_from_baseline),
transit_stations_percent_change_from_baseline=mean(transit_stations_percent_change_from_baseline),
workplaces_percent_change_from_baseline=mean(workplaces_percent_change_from_baseline),
residential_percent_change_from_baseline=mean(na.omit(residential_percent_change_from_baseline))) %>% ungroup()
## `summarise()` ungrouping output (override with `.groups` argument)
# lets look into missing data
md.pattern(in_mob_clean,rotate.names = T)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'

## date retail_and_recreation_percent_change_from_baseline
## 287 1 1
## 0 0
## grocery_and_pharmacy_percent_change_from_baseline
## 287 1
## 0
## parks_percent_change_from_baseline
## 287 1
## 0
## transit_stations_percent_change_from_baseline
## 287 1
## 0
## workplaces_percent_change_from_baseline
## 287 1
## 0
## residential_percent_change_from_baseline
## 287 1 0
## 0 0
md.pattern(gb_mob_clean,rotate.names = T)

## date transit_stations_percent_change_from_baseline
## 48 1 1
## 137 1 1
## 76 1 1
## 4 1 1
## 16 1 1
## 6 1 1
## 0 0
## residential_percent_change_from_baseline
## 48 1
## 137 1
## 76 1
## 4 1
## 16 1
## 6 1
## 0
## grocery_and_pharmacy_percent_change_from_baseline
## 48 1
## 137 1
## 76 1
## 4 1
## 16 1
## 6 0
## 6
## retail_and_recreation_percent_change_from_baseline
## 48 1
## 137 1
## 76 1
## 4 1
## 16 0
## 6 1
## 16
## workplaces_percent_change_from_baseline parks_percent_change_from_baseline
## 48 1 1
## 137 1 0
## 76 0 1
## 4 0 0
## 16 0 0
## 6 0 0
## 102 163
##
## 48 0
## 137 1
## 76 1
## 4 2
## 16 3
## 6 3
## 287
md.pattern(ch_mob,rotate.names = T)

## country_region_code country_region sub_region_1 iso_3166_2_code date
## 259 1 1 1 1 1
## 22 1 1 1 1 1
## 5 1 1 1 1 1
## 1 1 1 1 1 1
## 0 0 0 0 0
## retail_and_recreation_percent_change_from_baseline
## 259 1
## 22 1
## 5 1
## 1 1
## 0
## transit_stations_percent_change_from_baseline
## 259 1
## 22 1
## 5 1
## 1 1
## 0
## workplaces_percent_change_from_baseline
## 259 1
## 22 1
## 5 1
## 1 1
## 0
## residential_percent_change_from_baseline
## 259 1
## 22 1
## 5 1
## 1 1
## 0
## grocery_and_pharmacy_percent_change_from_baseline
## 259 1
## 22 1
## 5 0
## 1 0
## 6
## parks_percent_change_from_baseline sub_region_2 metro_area census_fips_code
## 259 1 0 0 0
## 22 0 0 0 0
## 5 1 0 0 0
## 1 0 0 0 0
## 23 287 287 287
##
## 259 3
## 22 4
## 5 4
## 1 5
## 890
# merge the mobility and air quality data
lon_merged <- left_join(london_air,gb_mob_clean,by='date')
del_merged <- left_join(delhi_air,in_mob_clean,by='date')
zur_merged <- left_join(zurich_air,ch_mob,by='date')
Explorary Data Analysis
# some visualizations
lon_air1 <- lon_merged %>% pivot_longer(cols=c(2:7),names_to="Pollutions",values_to="ugm")%>%
filter(Pollutions %in% c("pm25","pm10","no2","so2")) %>%mutate(city = "London")
plt1<- ggplot(lon_air1) +
geom_smooth(aes(x=date,y=ugm,color=Pollutions)) +
labs(x="", y="Concentration (µg/m3)")+
facet_wrap(~ city) +
theme_classic()+
theme(strip.text.x = element_text(size = 16)) +
theme(legend.position = "none",
strip.text.x = element_text(),
strip.text.y = element_text(),
strip.background = element_rect(fill="#E1E1E1"))
del_air1 <- del_merged %>% pivot_longer(cols=c(2:7),names_to="Pollutions",values_to="ugm")%>%
filter(Pollutions %in% c("pm25","pm10","no2","so2"))%>%mutate(city = "New Delhi")
plt2<- ggplot(del_air1) +
geom_smooth(aes(x=date,y=ugm,color=Pollutions)) +
labs(x="", y="Concentration (µg/m3)")+
facet_wrap(~ city) +
theme_classic()+
theme(strip.text.x = element_text(size = 16)) +
theme(legend.position = "none",
strip.text.x = element_text(),
strip.text.y = element_text(),
strip.background = element_rect(fill="#E1E1E1"))
zur_air1 <- zur_merged %>% pivot_longer(cols=c(2:7),names_to="Pollutions",values_to="ugm")%>%
filter(Pollutions %in% c("pm25","pm10","no2","so2"))%>%mutate(city = "Zurich")
plt3<- ggplot(zur_air1) +
geom_smooth(aes(x=date,y=ugm,color=Pollutions)) +
labs(x="", y="Concentration (µg/m3)") +
facet_wrap(~ city) +
theme_classic()+
theme(strip.text.x = element_text(size = 16)) +
theme(
strip.text.x = element_text(),
strip.text.y = element_text(),
strip.background = element_rect(fill="#E1E1E1"))
plt1+plt2+plt3+
plot_annotation(title = "Air pollutant concentrations reached their lowest points in three years in July 2020",subtitle = "Air quality data of London, New Delhi and Zürich over the past three years",caption="Aqicn.org") &
theme(
plot.title = element_text(face = "bold", size = 20, hjust = 0),
#hjust is to adjust the position of the title
plot.subtitle = element_text(size = 16, hjust = 0)
) &
theme(panel.background = element_rect(fill = NA,color="black"), # Remove gridlines
plot.title = element_text(, # Change text format
face = "bold",
size = 18),
plot.subtitle = element_text(
size = 16),
axis.title.x = element_blank(),
axis.text.x = element_text(),
axis.title.y = element_text(),
axis.text.y = element_text(),
aspect.ratio = 3/5)

lon_no2 <- london_air %>% summarise(date=date,London=no2)
del_no2 <- delhi_air %>% summarise(date=date,New_Delhi=no2)
zur_no2 <- zurich_air %>% summarise(date=date,Zurich=no2)
no2_merged <- left_join(lon_no2,del_no2,by='date')
no2_merged <- left_join(no2_merged,zur_no2,by='date')%>% pivot_longer(cols=c(2:4),names_to="city",values_to="no2")%>% mutate(component = "NO2")
plt4 <- ggplot(no2_merged) +
geom_smooth(aes(x=date,y=no2,color=city)) +
labs(x="", y="Concentration (µg/m3)")+
facet_wrap(~ component) +
theme_minimal()+
theme(strip.text.x = element_text(size = 16)) +
theme(legend.position = "none",
strip.text.x = element_text(),
strip.text.y = element_text(),
strip.background = element_rect(fill="#E1E1E1"))
lon_pm10 <- london_air %>% summarise(date=date,London=pm10)
del_pm10 <- delhi_air %>% summarise(date=date,New_Delhi=pm10)
zur_pm10 <- zurich_air %>% summarise(date=date,Zurich=pm10)
pm10_merged <- left_join(lon_pm10,del_pm10,by='date')
pm10_merged <- left_join(pm10_merged,zur_pm10,by='date')%>% pivot_longer(cols=c(2:4),names_to="city",values_to="pm10")%>% mutate(component = "PM10")
plt5 <- ggplot(pm10_merged) +
geom_smooth(aes(x=date,y=pm10,color=city)) +
labs(x="", y="Concentration (µg/m3)")+
facet_wrap(~ component) +
theme_minimal()+
theme(strip.text.x = element_text(size = 16)) +
theme(
strip.text.x = element_text(),
strip.text.y = element_text(),
strip.background = element_rect(fill="#E1E1E1"))
lon_pm25 <- london_air %>% summarise(date=date,London=pm25)
del_pm25 <- delhi_air %>% summarise(date=date,New_Delhi=pm25)
zur_pm25 <- zurich_air %>% summarise(date=date,Zurich=pm25)
pm25_merged <- left_join(lon_pm25,del_pm25,by='date')
pm25_merged <- left_join(pm25_merged,zur_pm25,by='date')%>% pivot_longer(cols=c(2:4),names_to="city",values_to="pm25")%>% mutate(component = "PM25")
plt6 <- ggplot(pm25_merged) +
geom_smooth(aes(x=date,y=pm25,color=city)) +
labs(x="", y="Concentration (µg/m3)")+
facet_wrap(~ component) +
theme_minimal()+
theme(strip.text.x = element_text(size = 16)) +
theme(legend.position = "none",
strip.text.x = element_text(),
strip.text.y = element_text(),
strip.background = element_rect(fill="#E1E1E1"))
lon_so2 <- london_air %>% summarise(date=date,London=so2)
del_so2 <- delhi_air %>% summarise(date=date,New_Delhi=so2)
zur_so2 <- zurich_air %>% summarise(date=date,Zurich=so2)
so2_merged <- left_join(lon_so2,del_so2,by='date')
so2_merged <- left_join(so2_merged,zur_so2,by='date')%>% pivot_longer(cols=c(2:4),names_to="city",values_to="so2")%>% mutate(component = "SO2")
plt7 <- ggplot(so2_merged) +
geom_smooth(aes(x=date,y=so2,color=city)) +
labs(x="", y="Concentration (µg/m3)")+
facet_wrap(~ component) +
theme_minimal()+
theme(strip.text.x = element_text(size = 16)) +
theme(
strip.text.x = element_text(),
strip.text.y = element_text(),
strip.background = element_rect(fill="#E1E1E1"))
plt4+plt5+plt6+plt7+
plot_annotation(title = "New Delhi has the most variable levels of pollutants throughout the year", subtitle = "Air quality data of London, New Delhi and Zürich over the past three years", caption="Aqicn.org") &
theme(
plot.title = element_text(face = "bold", size = 20, hjust = 0),
#hjust is to adjust the position of the title
plot.subtitle = element_text(size = 16, hjust = 0)
) &
theme(
plot.title = element_text(face = "bold", size = 20, hjust = 0),
#hjust is to adjust the position of the title
plot.subtitle = element_text(size = 16, hjust = 0)
) &
theme(panel.background = element_rect(fill = NA,color="black"), # Remove gridlines
plot.title = element_text(, # Change text format
face = "bold",
size = 18),
plot.subtitle = element_text(,
size = 16),
axis.title.x = element_blank(),
axis.text.x = element_text(),
axis.title.y = element_text(),
axis.text.y = element_text(),
aspect.ratio = 3/5)

# some visualizations
lon_pm25 <- lon_merged %>% mutate(city ="London")
plta <- ggplot(lon_pm25,aes(x=date,y=pm25,color=year))+
geom_smooth() +
geom_point() +
facet_wrap(~city)+
theme_minimal()+
theme(strip.text.x = element_text(size = 16)) +
theme(legend.position = "none") +
labs(x="")
del_pm25 <- del_merged %>% mutate(city="New Delhi")
pltb <- ggplot(del_pm25,aes(x=date,y=pm25,color=year)) +
geom_smooth() +
geom_point() +
facet_wrap(~city)+
theme_minimal()+
theme(strip.text.x = element_text(size = 16)) +
theme(legend.position = "none")+
labs(x="")
zur_pm25 <- zur_merged %>% mutate(city="Zurich")
pltc <- ggplot(zur_pm25,aes(x=date,y=pm25,color=year)) +
geom_smooth() +
geom_point() +
facet_wrap(~city)+
theme_minimal()+
theme(strip.text.x = element_text(size = 16)) +
labs(x="")
plta+pltb+pltc+
plot_annotation(title = " ", subtitle = "PM2.5 data of London, New Delhi and Zürich over the past three years", caption="Aqicn.org") &
theme(
plot.title = element_text(face = "bold", size = 20, hjust = 0),
#hjust is to adjust the position of the title
plot.subtitle = element_text(size = 16, hjust = 0)
)

lon_long <- lon_merged %>%
pivot_longer(cols=c(9:14),names_to="type",values_to="percent") %>% filter(date>=as.Date("2020-02-15"))
del_long <- del_merged %>% pivot_longer(cols=c(9:14),names_to="type",values_to="percent") %>% filter(date>=as.Date("2020-02-15"))
zur_long <- zur_merged %>% pivot_longer(cols=c(15:20),names_to="type",values_to="percent") %>% filter(date>=as.Date("2020-02-15"))
lon_mob <- lon_long %>% mutate (city = "London")
plt8 <- ggplot(lon_mob) +
geom_smooth(aes(x=date,y=percent,color=type)) +
labs(x="", y="Percent change from baseline")+
facet_wrap(~ city) +
theme_minimal()+
theme(strip.text.x = element_text(size = 16)) +
theme(legend.position = "none") +
theme(
strip.text.x = element_text(),
strip.text.y = element_text(),
strip.background = element_rect(fill="#E1E1E1"))
del_mob <- del_long %>% mutate (city = "New Delhi")
plt9 <- ggplot(del_mob) +
geom_smooth(aes(x=date,y=percent,color=type)) +
labs(x="", y="Percent change from baseline")+
facet_wrap(~ city) +
theme_minimal()+
theme(strip.text.x = element_text(size = 16)) +
theme(legend.position = "none") +
theme(
strip.text.x = element_text(),
strip.text.y = element_text(),
strip.background = element_rect(fill="#E1E1E1"))
zur_mob <- zur_merged %>%
summarise(
date = date,
Retail = retail_and_recreation_percent_change_from_baseline,
Grocery = grocery_and_pharmacy_percent_change_from_baseline,
Parks = parks_percent_change_from_baseline,
Transit = transit_stations_percent_change_from_baseline,
Workplaces = workplaces_percent_change_from_baseline,
Residential = residential_percent_change_from_baseline
) %>%pivot_longer(cols=c(2:7),names_to="type",values_to="percent") %>%filter(date>=as.Date("2020-02-15")) %>% mutate (city = "Zurich")
plt10 <- ggplot(zur_mob) +
geom_smooth(aes(x=date,y=percent,color=type)) +
labs(x="", y="Percent change from baseline")+
facet_wrap(~ city) +
theme_minimal()+
theme(strip.text.x = element_text(size = 16)) +
theme(
strip.text.x = element_text( ),
strip.text.y = element_text( ),
strip.background = element_rect(fill="#E1E1E1"))
plt8+plt9+plt10+
plot_annotation(title = "Mobility Patterns Differed By City", subtitle = "Mobility quality data of London, New Delhi and Zürich since 15th Feberary", caption="Google") &
theme(
panel.background = element_rect(fill = NA,color="black"), # Remove gridlines
plot.title = element_text( # Change text format
face = "bold",
size = 18),
plot.subtitle = element_text(
size = 16),
axis.title.x = element_blank(),
axis.text.x = element_text( ),
axis.title.y = element_text( ),
axis.text.y = element_text( ),
aspect.ratio = 3/5)

# attempt one at percent change
del_pm2_averages <- del_merged %>% filter(year!=2020) %>% mutate(month=month(date),day=day(date)) %>% group_by(month,day) %>% summarise(base_pm25=mean(pm25))
del_long <- del_long %>% mutate(month=month(date),day=day(date)) %>%
merge(.,del_pm2_averages,by=c("month","day")) %>%
mutate(per_change_pm25=(pm25-base_pm25)/base_pm25*100)
lon_pm2_averages <- lon_merged %>% filter(year!=2020) %>% mutate(month=month(date),day=day(date)) %>% group_by(month,day) %>% summarise(base_pm25=mean(pm25))
lon_long <- lon_long %>% mutate(month=month(date),day=day(date)) %>%
merge(.,lon_pm2_averages,by=c("month","day")) %>%
mutate(per_change_pm25=(pm25-base_pm25)/base_pm25*100)
zur_pm2_averages <- zur_merged %>% filter(year!=2020) %>%
mutate(month=month(date),day=day(date)) %>% group_by(month,day) %>% summarise(base_pm25=mean(pm25))
zur_long <- zur_long %>% mutate(month=month(date),day=day(date)) %>%
merge(.,zur_pm2_averages,by=c("month","day")) %>%
mutate(per_change_pm25=(pm25-base_pm25)/base_pm25*100)
lon_pm25_change <- lon_long %>% summarise(date=date,London=per_change_pm25)
del_pm25_change <- del_long %>% summarise(date=date,New_Delhi=per_change_pm25)
zur_pm25_change <- zur_long %>% summarise(date=date,Zurich=per_change_pm25)
pm25_change_merged <- left_join(lon_pm25_change,del_pm25_change,by='date')
pm25_change_merged <- left_join(pm25_change_merged,zur_pm25_change,by='date')%>% pivot_longer(cols=c(2:4),names_to="city",values_to="per_change_pm25") %>% mutate(month=month(date))%>%
group_by(month,city)%>% summarise(mean_change_pm25 = mean(per_change_pm25))
plt11 <- ggplot(pm25_change_merged,aes(x=as.factor(month),y=mean_change_pm25)) +
geom_bar(stat="identity",aes(color=city,fill=city))+
labs(x="", y="Percent change of PM25 from baseline")+
facet_wrap(~city) +
theme_minimal()+
theme(strip.text.x = element_text(size = 16)) +
labs(title ="Percent change of PM2.5 from baseline in 2020", subtitle = "PM2.5 data of London, New Delhi and Zürich at 2020") +
scale_x_discrete(breaks= c(2:12),labels= c("Feb", "Mar", "Apr","May", "Jun","July","Aug","Sep","Oct","Nov","Dec"))+
theme(
plot.title = element_text(face = "bold", size = 20, hjust = 0),
#hjust is to adjust the position of the title
plot.subtitle = element_text(size = 16, hjust = 0)
) +
theme(legend.position = "none")
lon_pm25 <- lon_long %>% summarise(date=date,London=pm25)
del_pm25 <- del_long %>% summarise(date=date,New_Delhi=pm25)
zur_pm25 <- zur_long %>% summarise(date=date,Zurich=pm25)
pm25_merged <- left_join(lon_pm25,del_pm25,by='date')
pm25_merged <- left_join(pm25_merged,zur_pm25,by='date')%>% pivot_longer(cols=c(2:4),names_to="city",values_to="pm25") %>% mutate(year = year(date),month=month(date))%>%filter(year==2020)%>%
group_by(year,month,city)%>% summarise(mean_pm25 = mean(pm25))
plt12 <- ggplot(pm25_merged,aes(x=as.factor(month),y=mean_pm25,label=round(mean_pm25,2))) +
geom_bar(stat="identity",aes(color=city,fill=city))+
geom_text(vjust = 0) + # add figures on each column
labs(x="", y="Mean PM2.5 concentration")+
coord_flip()+
facet_wrap(~city,scales = "free") +
theme_minimal()+
theme(strip.text.x = element_text(size = 16)) +
labs(title ="Mean PM2.5 concentration in 2020") +
scale_x_discrete(breaks= c(2:12),labels= c("Feb", "Mar", "Apr","May", "Jun","July","Aug","Sep","Oct","Nov","Dec"))+
theme(
plot.title = element_text(face = "bold", size = 20, hjust = 0),
#hjust is to adjust the position of the title
plot.subtitle = element_text(size = 16, hjust = 0)
) +
theme(legend.position = "none")
plt11/plt12 +
theme(
plot.title = element_text(face = "bold", size = 20, hjust = 0),
#hjust is to adjust the position of the title
plot.subtitle = element_text(size = 16, hjust = 0) )

# plot pm2 vs workplace percent change
del_long %>%
filter(date>=as.Date("2020-05-01")) %>%
ggplot(aes(y=per_change_pm25,x=percent)) +
geom_smooth() + facet_wrap(~type,scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 72 rows containing non-finite values (stat_smooth).

lon_long %>%
filter(date>=as.Date("2020-05-01")) %>%
ggplot(aes(y=per_change_pm25,x=percent)) +
geom_smooth() + facet_wrap(~type,scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 271 rows containing non-finite values (stat_smooth).

zur_long %>%
filter(date>=as.Date("2020-05-01")) %>%
ggplot(aes(y=per_change_pm25,x=percent)) +
geom_smooth() + facet_wrap(~type,scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 56 rows containing non-finite values (stat_smooth).

del_merged <- del_merged %>% mutate(month=month(date),day=day(date)) %>%
merge(.,del_pm2_averages,by=c("month","day")) %>%
mutate(per_change_pm25=(pm25-base_pm25)/base_pm25*100)
del_merged %>%
#filter(date>=as.Date("2020-01-01")) %>%
ggplot(aes(y=per_change_pm25,x=date)) +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 26 rows containing non-finite values (stat_smooth).

lon_merged <- lon_merged %>% mutate(month=month(date),day=day(date)) %>%
merge(.,del_pm2_averages,by=c("month","day")) %>%
mutate(per_change_pm25=(pm25-base_pm25)/base_pm25*100)
lon_merged %>%
#filter(date>=as.Date("2020-01-01")) %>%
ggplot(aes(y=per_change_pm25,x=date)) +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 24 rows containing non-finite values (stat_smooth).

zur_merged <- zur_merged %>% mutate(month=month(date),day=day(date)) %>%
merge(.,del_pm2_averages,by=c("month","day")) %>%
mutate(per_change_pm25=(pm25-base_pm25)/base_pm25*100)
zur_merged %>%
#filter(date>=as.Date("2020-01-01")) %>%
ggplot(aes(y=per_change_pm25,x=date)) +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 25 rows containing non-finite values (stat_smooth).

Random Forest
# CHANGED: whole chunk
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## Loading required package: rpart
train_control <- trainControl (
method="cv",
number=5,
verboseIter=TRUE)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
set.seed(1234)
# Fit random forest: model
merged_nona <- del_merged %>% mutate(month=month(date))
merged_nona %>% ggcorr(layout.position="left")
## Warning in ggcorr(., layout.position = "left"): data in column(s) 'date' are not
## numeric and were ignored
## Warning in cor(data, use = method[1], method = method[2]): the standard
## deviation is zero
## Warning: Ignoring unknown parameters: layout.position

rf_RF <- train(
no2 ~ retail_and_recreation_percent_change_from_baseline+
grocery_and_pharmacy_percent_change_from_baseline+
transit_stations_percent_change_from_baseline+
retail_and_recreation_percent_change_from_baseline+
workplaces_percent_change_from_baseline+
residential_percent_change_from_baseline+
month,
merged_nona,
method = "ranger",
na.action = na.omit,
#metric="ROC",
trControl = train_control,
importance = 'permutation',
tuneLength=10
)
## note: only 5 unique complexity parameters in default grid. Truncating the grid to 5 .
##
## + Fold1: mtry=2, min.node.size=5, splitrule=variance
## - Fold1: mtry=2, min.node.size=5, splitrule=variance
## + Fold1: mtry=3, min.node.size=5, splitrule=variance
## - Fold1: mtry=3, min.node.size=5, splitrule=variance
## + Fold1: mtry=4, min.node.size=5, splitrule=variance
## - Fold1: mtry=4, min.node.size=5, splitrule=variance
## + Fold1: mtry=5, min.node.size=5, splitrule=variance
## - Fold1: mtry=5, min.node.size=5, splitrule=variance
## + Fold1: mtry=6, min.node.size=5, splitrule=variance
## - Fold1: mtry=6, min.node.size=5, splitrule=variance
## + Fold1: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold1: mtry=3, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=3, min.node.size=5, splitrule=extratrees
## + Fold1: mtry=4, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=4, min.node.size=5, splitrule=extratrees
## + Fold1: mtry=5, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=5, min.node.size=5, splitrule=extratrees
## + Fold1: mtry=6, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=6, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=2, min.node.size=5, splitrule=variance
## - Fold2: mtry=2, min.node.size=5, splitrule=variance
## + Fold2: mtry=3, min.node.size=5, splitrule=variance
## - Fold2: mtry=3, min.node.size=5, splitrule=variance
## + Fold2: mtry=4, min.node.size=5, splitrule=variance
## - Fold2: mtry=4, min.node.size=5, splitrule=variance
## + Fold2: mtry=5, min.node.size=5, splitrule=variance
## - Fold2: mtry=5, min.node.size=5, splitrule=variance
## + Fold2: mtry=6, min.node.size=5, splitrule=variance
## - Fold2: mtry=6, min.node.size=5, splitrule=variance
## + Fold2: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=3, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=3, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=4, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=4, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=5, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=5, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=6, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=6, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=2, min.node.size=5, splitrule=variance
## - Fold3: mtry=2, min.node.size=5, splitrule=variance
## + Fold3: mtry=3, min.node.size=5, splitrule=variance
## - Fold3: mtry=3, min.node.size=5, splitrule=variance
## + Fold3: mtry=4, min.node.size=5, splitrule=variance
## - Fold3: mtry=4, min.node.size=5, splitrule=variance
## + Fold3: mtry=5, min.node.size=5, splitrule=variance
## - Fold3: mtry=5, min.node.size=5, splitrule=variance
## + Fold3: mtry=6, min.node.size=5, splitrule=variance
## - Fold3: mtry=6, min.node.size=5, splitrule=variance
## + Fold3: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=3, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=3, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=4, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=4, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=5, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=5, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=6, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=6, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=2, min.node.size=5, splitrule=variance
## - Fold4: mtry=2, min.node.size=5, splitrule=variance
## + Fold4: mtry=3, min.node.size=5, splitrule=variance
## - Fold4: mtry=3, min.node.size=5, splitrule=variance
## + Fold4: mtry=4, min.node.size=5, splitrule=variance
## - Fold4: mtry=4, min.node.size=5, splitrule=variance
## + Fold4: mtry=5, min.node.size=5, splitrule=variance
## - Fold4: mtry=5, min.node.size=5, splitrule=variance
## + Fold4: mtry=6, min.node.size=5, splitrule=variance
## - Fold4: mtry=6, min.node.size=5, splitrule=variance
## + Fold4: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=3, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=3, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=4, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=4, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=5, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=5, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=6, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=6, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=2, min.node.size=5, splitrule=variance
## - Fold5: mtry=2, min.node.size=5, splitrule=variance
## + Fold5: mtry=3, min.node.size=5, splitrule=variance
## - Fold5: mtry=3, min.node.size=5, splitrule=variance
## + Fold5: mtry=4, min.node.size=5, splitrule=variance
## - Fold5: mtry=4, min.node.size=5, splitrule=variance
## + Fold5: mtry=5, min.node.size=5, splitrule=variance
## - Fold5: mtry=5, min.node.size=5, splitrule=variance
## + Fold5: mtry=6, min.node.size=5, splitrule=variance
## - Fold5: mtry=6, min.node.size=5, splitrule=variance
## + Fold5: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=3, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=3, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=4, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=4, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=5, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=5, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=6, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=6, min.node.size=5, splitrule=extratrees
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2, splitrule = extratrees, min.node.size = 5 on full training set
## mtry min.node.size splitrule RMSE Rsquared MAE RMSESD
## 1 2 5 variance 5.895659 0.5332000 3.634501 1.828347
## 2 2 5 extratrees 5.642804 0.5763548 3.428968 1.599914
## 3 3 5 variance 5.915773 0.5286914 3.674713 1.808358
## 4 3 5 extratrees 5.710576 0.5710011 3.439608 1.597044
## 5 4 5 variance 5.903694 0.5286719 3.699773 1.860870
## 6 4 5 extratrees 5.757206 0.5694111 3.471320 1.566789
## 7 5 5 variance 5.942272 0.5239263 3.722403 1.887731
## 8 5 5 extratrees 5.703927 0.5737807 3.454761 1.583589
## 9 6 5 variance 5.945269 0.5217063 3.747305 1.968981
## 10 6 5 extratrees 5.759848 0.5681858 3.463326 1.506072
## RsquaredSD MAESD
## 1 0.2264378 0.9307786
## 2 0.2057696 0.8127352
## 3 0.2217819 0.9236894
## 4 0.2094391 0.8132411
## 5 0.2241502 0.9500026
## 6 0.2081735 0.7982849
## 7 0.2256370 1.0019769
## 8 0.2060450 0.7893223
## 9 0.2319573 1.0330969
## 10 0.1997430 0.7576583
## mtry splitrule min.node.size
## 2 2 extratrees 5
importance <- varImp(rf_RF, scale=TRUE)
importance
## ranger variable importance
##
## Overall
## month 100.00
## retail_and_recreation_percent_change_from_baseline 86.97
## transit_stations_percent_change_from_baseline 77.20
## residential_percent_change_from_baseline 25.18
## grocery_and_pharmacy_percent_change_from_baseline 13.96
## workplaces_percent_change_from_baseline 0.00
varImps <- importance$importance %>% rownames_to_column()
# Plot variable inportance bar chart
varImps %>% ggplot(aes(x = reorder(rowname,-Overall),
y = Overall)) +
geom_col(fill = "#74AEDB",
color = "#4F96CD") +
scale_x_discrete(labels=c("Month","Retail & Recreation","Transit Stations",
"Residential","Grocery & Pharmacy","Workplaces"))+
theme_classic() +
labs(title = "New Delhi's air quality was predominantly influenced by the time of the year rather than
changes in mobility",
subtitle = "Feature importance of mobility categories and month towards levels of NO2 in 2020",
y = "Feature importance weight",
x = "") +
theme(panel.background = element_rect(fill = NA), # Remove gridlines
plot.title = element_text( # Change text format
face = "bold",
size = 12),
plot.subtitle = element_text(
size = 10),
axis.title.x = element_blank(),
axis.text.x = element_text(),
axis.title.y = element_text(),
axis.text.y = element_text(),
aspect.ratio = 3/5)

# CHANGED: whole chunk
train_control <- trainControl (
method="cv",
number=5,
verboseIter=TRUE)
merged_nona %>% ggcorr(layout.position="left")
## Warning in ggcorr(., layout.position = "left"): data in column(s) 'date' are not
## numeric and were ignored
## Warning in cor(data, use = method[1], method = method[2]): the standard
## deviation is zero
## Warning: Ignoring unknown parameters: layout.position

set.seed(1234)
# Fit random forest: model
merged_nona <- lon_merged %>% mutate(month=month(date))
rf_RF <- train(
no2 ~ retail_and_recreation_percent_change_from_baseline+
grocery_and_pharmacy_percent_change_from_baseline+
transit_stations_percent_change_from_baseline+
retail_and_recreation_percent_change_from_baseline+
workplaces_percent_change_from_baseline+
residential_percent_change_from_baseline+
month,
merged_nona,
method = "ranger",
na.action = na.omit,
#metric="ROC",
trControl = train_control,
importance = 'permutation',
tuneLength=10
)
## note: only 5 unique complexity parameters in default grid. Truncating the grid to 5 .
##
## + Fold1: mtry=2, min.node.size=5, splitrule=variance
## - Fold1: mtry=2, min.node.size=5, splitrule=variance
## + Fold1: mtry=3, min.node.size=5, splitrule=variance
## - Fold1: mtry=3, min.node.size=5, splitrule=variance
## + Fold1: mtry=4, min.node.size=5, splitrule=variance
## - Fold1: mtry=4, min.node.size=5, splitrule=variance
## + Fold1: mtry=5, min.node.size=5, splitrule=variance
## - Fold1: mtry=5, min.node.size=5, splitrule=variance
## + Fold1: mtry=6, min.node.size=5, splitrule=variance
## - Fold1: mtry=6, min.node.size=5, splitrule=variance
## + Fold1: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold1: mtry=3, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=3, min.node.size=5, splitrule=extratrees
## + Fold1: mtry=4, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=4, min.node.size=5, splitrule=extratrees
## + Fold1: mtry=5, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=5, min.node.size=5, splitrule=extratrees
## + Fold1: mtry=6, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=6, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=2, min.node.size=5, splitrule=variance
## - Fold2: mtry=2, min.node.size=5, splitrule=variance
## + Fold2: mtry=3, min.node.size=5, splitrule=variance
## - Fold2: mtry=3, min.node.size=5, splitrule=variance
## + Fold2: mtry=4, min.node.size=5, splitrule=variance
## - Fold2: mtry=4, min.node.size=5, splitrule=variance
## + Fold2: mtry=5, min.node.size=5, splitrule=variance
## - Fold2: mtry=5, min.node.size=5, splitrule=variance
## + Fold2: mtry=6, min.node.size=5, splitrule=variance
## - Fold2: mtry=6, min.node.size=5, splitrule=variance
## + Fold2: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=3, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=3, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=4, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=4, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=5, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=5, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=6, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=6, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=2, min.node.size=5, splitrule=variance
## - Fold3: mtry=2, min.node.size=5, splitrule=variance
## + Fold3: mtry=3, min.node.size=5, splitrule=variance
## - Fold3: mtry=3, min.node.size=5, splitrule=variance
## + Fold3: mtry=4, min.node.size=5, splitrule=variance
## - Fold3: mtry=4, min.node.size=5, splitrule=variance
## + Fold3: mtry=5, min.node.size=5, splitrule=variance
## - Fold3: mtry=5, min.node.size=5, splitrule=variance
## + Fold3: mtry=6, min.node.size=5, splitrule=variance
## - Fold3: mtry=6, min.node.size=5, splitrule=variance
## + Fold3: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=3, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=3, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=4, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=4, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=5, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=5, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=6, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=6, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=2, min.node.size=5, splitrule=variance
## - Fold4: mtry=2, min.node.size=5, splitrule=variance
## + Fold4: mtry=3, min.node.size=5, splitrule=variance
## - Fold4: mtry=3, min.node.size=5, splitrule=variance
## + Fold4: mtry=4, min.node.size=5, splitrule=variance
## - Fold4: mtry=4, min.node.size=5, splitrule=variance
## + Fold4: mtry=5, min.node.size=5, splitrule=variance
## - Fold4: mtry=5, min.node.size=5, splitrule=variance
## + Fold4: mtry=6, min.node.size=5, splitrule=variance
## - Fold4: mtry=6, min.node.size=5, splitrule=variance
## + Fold4: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=3, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=3, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=4, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=4, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=5, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=5, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=6, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=6, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=2, min.node.size=5, splitrule=variance
## - Fold5: mtry=2, min.node.size=5, splitrule=variance
## + Fold5: mtry=3, min.node.size=5, splitrule=variance
## - Fold5: mtry=3, min.node.size=5, splitrule=variance
## + Fold5: mtry=4, min.node.size=5, splitrule=variance
## - Fold5: mtry=4, min.node.size=5, splitrule=variance
## + Fold5: mtry=5, min.node.size=5, splitrule=variance
## - Fold5: mtry=5, min.node.size=5, splitrule=variance
## + Fold5: mtry=6, min.node.size=5, splitrule=variance
## - Fold5: mtry=6, min.node.size=5, splitrule=variance
## + Fold5: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=3, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=3, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=4, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=4, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=5, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=5, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=6, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=6, min.node.size=5, splitrule=extratrees
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2, splitrule = variance, min.node.size = 5 on full training set
## mtry min.node.size splitrule RMSE Rsquared MAE RMSESD
## 1 2 5 variance 6.220652 0.3771793 4.749240 0.9403794
## 2 2 5 extratrees 6.229913 0.3761782 4.792067 0.8035583
## 3 3 5 variance 6.336442 0.3614509 4.822445 0.9877370
## 4 3 5 extratrees 6.238146 0.3774324 4.788488 0.9040981
## 5 4 5 variance 6.329772 0.3656350 4.808596 1.0450919
## 6 4 5 extratrees 6.246336 0.3757739 4.811156 0.8957730
## 7 5 5 variance 6.355355 0.3605668 4.818146 1.0295295
## 8 5 5 extratrees 6.260158 0.3738335 4.774717 0.9185210
## 9 6 5 variance 6.402685 0.3560686 4.843809 0.9890533
## 10 6 5 extratrees 6.265499 0.3743974 4.795547 0.8884774
## RsquaredSD MAESD
## 1 0.1831695 0.4762887
## 2 0.1727587 0.4373790
## 3 0.1959177 0.5325098
## 4 0.1875821 0.5175653
## 5 0.1994105 0.5018265
## 6 0.1893933 0.5241401
## 7 0.2008909 0.5100226
## 8 0.1904997 0.5157018
## 9 0.1958405 0.4905065
## 10 0.1892552 0.5355678
## mtry splitrule min.node.size
## 1 2 variance 5
importance <- varImp(rf_RF, scale=TRUE)
importance
## ranger variable importance
##
## Overall
## retail_and_recreation_percent_change_from_baseline 100.000
## workplaces_percent_change_from_baseline 74.370
## transit_stations_percent_change_from_baseline 62.726
## residential_percent_change_from_baseline 46.690
## grocery_and_pharmacy_percent_change_from_baseline 2.391
## month 0.000
varImps <- importance$importance %>% rownames_to_column()
# Plot variable inportance bar chart
varImps %>% ggplot(aes(x = reorder(rowname,-Overall),
y = Overall)) +
geom_col(fill = "#74AEDB",
color = "#4F96CD") +
scale_x_discrete(labels=c("Workplaces", "Transit Stations", "Retail & Recreation", "Residential", "Month", "Grocery & Pharmacy")) +
theme_classic() +
labs(title = "London's air quality was highly affected by mobility patterns rather than the month",
subtitle = "Feature importance of mobility categories and month towards levels of NO2 in 2020",
y = "Feature importance weight",
x = "") +
theme(panel.background = element_rect(fill = NA), # Remove gridlines
plot.title = element_text(, # Change text format
face = "bold",
size = 12),
plot.subtitle = element_text(,
size = 10),
axis.title.x = element_blank(),
axis.text.x = element_text(),
axis.title.y = element_text(),
axis.text.y = element_text())

# CHANGED: whole chunk
train_control <- trainControl (
method="cv",
number=5,
verboseIter=TRUE)
merged_nona %>% ggcorr(layout.position="left")
## Warning in ggcorr(., layout.position = "left"): data in column(s) 'date' are not
## numeric and were ignored
## Warning in cor(data, use = method[1], method = method[2]): the standard
## deviation is zero
## Warning: Ignoring unknown parameters: layout.position

set.seed(1234)
# Fit random forest: model
merged_nona <- zur_merged %>% mutate(month=month(date))
rf_RF <- train(
no2 ~ retail_and_recreation_percent_change_from_baseline +
grocery_and_pharmacy_percent_change_from_baseline +
transit_stations_percent_change_from_baseline +
retail_and_recreation_percent_change_from_baseline +
workplaces_percent_change_from_baseline +
residential_percent_change_from_baseline +
month,
merged_nona,
method = "ranger",
na.action = na.omit,
#metric="ROC",
trControl = train_control,
importance = 'permutation',
tuneLength=10
)
## note: only 5 unique complexity parameters in default grid. Truncating the grid to 5 .
##
## + Fold1: mtry=2, min.node.size=5, splitrule=variance
## - Fold1: mtry=2, min.node.size=5, splitrule=variance
## + Fold1: mtry=3, min.node.size=5, splitrule=variance
## - Fold1: mtry=3, min.node.size=5, splitrule=variance
## + Fold1: mtry=4, min.node.size=5, splitrule=variance
## - Fold1: mtry=4, min.node.size=5, splitrule=variance
## + Fold1: mtry=5, min.node.size=5, splitrule=variance
## - Fold1: mtry=5, min.node.size=5, splitrule=variance
## + Fold1: mtry=6, min.node.size=5, splitrule=variance
## - Fold1: mtry=6, min.node.size=5, splitrule=variance
## + Fold1: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold1: mtry=3, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=3, min.node.size=5, splitrule=extratrees
## + Fold1: mtry=4, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=4, min.node.size=5, splitrule=extratrees
## + Fold1: mtry=5, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=5, min.node.size=5, splitrule=extratrees
## + Fold1: mtry=6, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=6, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=2, min.node.size=5, splitrule=variance
## - Fold2: mtry=2, min.node.size=5, splitrule=variance
## + Fold2: mtry=3, min.node.size=5, splitrule=variance
## - Fold2: mtry=3, min.node.size=5, splitrule=variance
## + Fold2: mtry=4, min.node.size=5, splitrule=variance
## - Fold2: mtry=4, min.node.size=5, splitrule=variance
## + Fold2: mtry=5, min.node.size=5, splitrule=variance
## - Fold2: mtry=5, min.node.size=5, splitrule=variance
## + Fold2: mtry=6, min.node.size=5, splitrule=variance
## - Fold2: mtry=6, min.node.size=5, splitrule=variance
## + Fold2: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=3, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=3, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=4, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=4, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=5, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=5, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=6, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=6, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=2, min.node.size=5, splitrule=variance
## - Fold3: mtry=2, min.node.size=5, splitrule=variance
## + Fold3: mtry=3, min.node.size=5, splitrule=variance
## - Fold3: mtry=3, min.node.size=5, splitrule=variance
## + Fold3: mtry=4, min.node.size=5, splitrule=variance
## - Fold3: mtry=4, min.node.size=5, splitrule=variance
## + Fold3: mtry=5, min.node.size=5, splitrule=variance
## - Fold3: mtry=5, min.node.size=5, splitrule=variance
## + Fold3: mtry=6, min.node.size=5, splitrule=variance
## - Fold3: mtry=6, min.node.size=5, splitrule=variance
## + Fold3: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=3, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=3, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=4, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=4, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=5, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=5, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=6, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=6, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=2, min.node.size=5, splitrule=variance
## - Fold4: mtry=2, min.node.size=5, splitrule=variance
## + Fold4: mtry=3, min.node.size=5, splitrule=variance
## - Fold4: mtry=3, min.node.size=5, splitrule=variance
## + Fold4: mtry=4, min.node.size=5, splitrule=variance
## - Fold4: mtry=4, min.node.size=5, splitrule=variance
## + Fold4: mtry=5, min.node.size=5, splitrule=variance
## - Fold4: mtry=5, min.node.size=5, splitrule=variance
## + Fold4: mtry=6, min.node.size=5, splitrule=variance
## - Fold4: mtry=6, min.node.size=5, splitrule=variance
## + Fold4: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=3, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=3, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=4, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=4, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=5, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=5, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=6, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=6, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=2, min.node.size=5, splitrule=variance
## - Fold5: mtry=2, min.node.size=5, splitrule=variance
## + Fold5: mtry=3, min.node.size=5, splitrule=variance
## - Fold5: mtry=3, min.node.size=5, splitrule=variance
## + Fold5: mtry=4, min.node.size=5, splitrule=variance
## - Fold5: mtry=4, min.node.size=5, splitrule=variance
## + Fold5: mtry=5, min.node.size=5, splitrule=variance
## - Fold5: mtry=5, min.node.size=5, splitrule=variance
## + Fold5: mtry=6, min.node.size=5, splitrule=variance
## - Fold5: mtry=6, min.node.size=5, splitrule=variance
## + Fold5: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=3, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=3, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=4, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=4, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=5, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=5, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=6, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=6, min.node.size=5, splitrule=extratrees
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 6, splitrule = extratrees, min.node.size = 5 on full training set
## mtry min.node.size splitrule RMSE Rsquared MAE RMSESD
## 1 2 5 variance 3.000506 0.4496856 2.378327 0.1172283
## 2 2 5 extratrees 3.001319 0.4583348 2.372803 0.1508350
## 3 3 5 variance 2.949843 0.4660592 2.339765 0.1240202
## 4 3 5 extratrees 2.947309 0.4712464 2.329781 0.1496512
## 5 4 5 variance 2.945332 0.4641320 2.340246 0.1234652
## 6 4 5 extratrees 2.930656 0.4767248 2.306389 0.1600960
## 7 5 5 variance 2.945854 0.4617479 2.340699 0.1388317
## 8 5 5 extratrees 2.904380 0.4845298 2.291234 0.1742381
## 9 6 5 variance 2.930042 0.4665792 2.324449 0.1542368
## 10 6 5 extratrees 2.885295 0.4903119 2.270358 0.1750904
## RsquaredSD MAESD
## 1 0.04766182 0.10408569
## 2 0.03503080 0.14642479
## 3 0.06315041 0.09631007
## 4 0.03598054 0.14576105
## 5 0.07534934 0.08003658
## 6 0.04008528 0.16119739
## 7 0.08984957 0.06997849
## 8 0.03821441 0.16997864
## 9 0.09611422 0.07338852
## 10 0.04528749 0.15883673
## mtry splitrule min.node.size
## 10 6 extratrees 5
importance <- varImp(rf_RF, scale=TRUE)
importance
## ranger variable importance
##
## Overall
## month 100.000
## retail_and_recreation_percent_change_from_baseline 10.675
## residential_percent_change_from_baseline 9.253
## transit_stations_percent_change_from_baseline 8.080
## workplaces_percent_change_from_baseline 6.642
## grocery_and_pharmacy_percent_change_from_baseline 0.000
varImps <- importance$importance %>% rownames_to_column()
# Plot variable inportance bar chart
varImps %>% ggplot(aes(x = reorder(rowname,-Overall),
y = Overall)) +
geom_col(fill = "#74AEDB",
color = "#4F96CD") +
scale_x_discrete(labels=c("Month","Retail & Recreation", "Transit Stations", "Residential", "Workplaces", "Grocery & Pharmacy")) +
theme_classic() +
labs(title = "Zurich's air quality stayed largely unaffected by changes in mobility",
subtitle = "Feature importance of mobility categories and month towards levels of NO2 in 2020",
y = "Feature importance weight",
x = "") +
theme(panel.background = element_rect(fill = NA), # Remove gridlines
plot.title = element_text(, # Change text format
face = "bold",
size = 12),
plot.subtitle = element_text(,
size = 10),
axis.title.x = element_blank(),
axis.text.x = element_text(),
axis.title.y = element_text(),
axis.text.y = element_text())
